{
 "cells": [
  {
   "cell_type": "code",
   "id": "initial_id",
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2025-01-08T14:32:29.686498Z",
     "start_time": "2025-01-08T14:32:29.680179Z"
    }
   },
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "#分组后给名称加前缀\n",
    "dict_obj = {'key1' : ['a', 'b', 'a', 'b',\n",
    "                      'a', 'b', 'a', 'a'],\n",
    "            'key2' : ['one', 'one', 'two', 'three',\n",
    "                      'two', 'two', 'one', 'three'],\n",
    "            'data1': np.random.randint(1, 10, 8),\n",
    "            'data2': np.random.randint(1, 10, 8)}\n",
    "df_obj = pd.DataFrame(dict_obj)\n",
    "print(df_obj)\n",
    "print('-'*50)"
   ],
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  key1   key2  data1  data2\n",
      "0    a    one      3      9\n",
      "1    b    one      8      2\n",
      "2    a    two      3      7\n",
      "3    b  three      1      4\n",
      "4    a    two      3      8\n",
      "5    b    two      8      3\n",
      "6    a    one      1      6\n",
      "7    a  three      6      8\n",
      "--------------------------------------------------\n"
     ]
    }
   ],
   "execution_count": 34
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-08T14:32:29.692745Z",
     "start_time": "2025-01-08T14:32:29.688501Z"
    }
   },
   "cell_type": "code",
   "source": "df_obj.info()",
   "id": "197d678babed83da",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 8 entries, 0 to 7\n",
      "Data columns (total 4 columns):\n",
      " #   Column  Non-Null Count  Dtype \n",
      "---  ------  --------------  ----- \n",
      " 0   key1    8 non-null      object\n",
      " 1   key2    8 non-null      object\n",
      " 2   data1   8 non-null      int32 \n",
      " 3   data2   8 non-null      int32 \n",
      "dtypes: int32(2), object(2)\n",
      "memory usage: 324.0+ bytes\n"
     ]
    }
   ],
   "execution_count": 35
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-08T14:32:29.698128Z",
     "start_time": "2025-01-08T14:32:29.693748Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# 按key1分组后，计算data1，data2的统计信息并附加到原始表格中，并添加表头前缀\n",
    "k1_sum = df_obj.groupby('key1').mean(numeric_only=True).add_prefix('mean_')\n",
    "print(k1_sum)"
   ],
   "id": "1c3de55de866a5c8",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "      mean_data1  mean_data2\n",
      "key1                        \n",
      "a       3.200000         7.6\n",
      "b       5.666667         3.0\n"
     ]
    }
   ],
   "execution_count": 36
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-08T14:32:29.704781Z",
     "start_time": "2025-01-08T14:32:29.699132Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# 方法2，使用transform，分组后计算结果和原本的df保持一致\n",
    "k1_sum_tf = df_obj.loc[:,['key1','data1', 'data2']].groupby('key1').transform('mean').add_prefix('mean_')\n",
    "k1_sum_tf\n",
    "# df_obj[k1_sum_tf.columns] = k1_sum_tf\n",
    "# print(df_obj)"
   ],
   "id": "c739a7d8300c08aa",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   mean_data1  mean_data2\n",
       "0    3.200000         7.6\n",
       "1    5.666667         3.0\n",
       "2    3.200000         7.6\n",
       "3    5.666667         3.0\n",
       "4    3.200000         7.6\n",
       "5    5.666667         3.0\n",
       "6    3.200000         7.6\n",
       "7    3.200000         7.6"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mean_data1</th>\n",
       "      <th>mean_data2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3.200000</td>\n",
       "      <td>7.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5.666667</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3.200000</td>\n",
       "      <td>7.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>5.666667</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3.200000</td>\n",
       "      <td>7.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>5.666667</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>3.200000</td>\n",
       "      <td>7.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>3.200000</td>\n",
       "      <td>7.6</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 37
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-08T14:32:29.707783Z",
     "start_time": "2025-01-08T14:32:29.705785Z"
    }
   },
   "cell_type": "code",
   "source": "del df_obj['key2']",
   "id": "b316c9ea8b27e4c1",
   "outputs": [],
   "execution_count": 38
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-08T14:32:29.713867Z",
     "start_time": "2025-01-08T14:32:29.708786Z"
    }
   },
   "cell_type": "code",
   "source": "df_obj.groupby('key1').transform(np.mean)",
   "id": "34bef6efcfbb251b",
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\EAST\\AppData\\Local\\Temp\\ipykernel_17524\\2916770615.py:1: FutureWarning: The provided callable <function mean at 0x0000027E39432520> is currently using DataFrameGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string \"mean\" instead.\n",
      "  df_obj.groupby('key1').transform(np.mean)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "      data1  data2\n",
       "0  3.200000    7.6\n",
       "1  5.666667    3.0\n",
       "2  3.200000    7.6\n",
       "3  5.666667    3.0\n",
       "4  3.200000    7.6\n",
       "5  5.666667    3.0\n",
       "6  3.200000    7.6\n",
       "7  3.200000    7.6"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>data1</th>\n",
       "      <th>data2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3.200000</td>\n",
       "      <td>7.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5.666667</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3.200000</td>\n",
       "      <td>7.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>5.666667</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3.200000</td>\n",
       "      <td>7.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>5.666667</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>3.200000</td>\n",
       "      <td>7.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>3.200000</td>\n",
       "      <td>7.6</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 39
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-08T14:32:29.718951Z",
     "start_time": "2025-01-08T14:32:29.713867Z"
    }
   },
   "cell_type": "code",
   "source": "df_obj",
   "id": "f911af23e777a7c6",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "  key1  data1  data2\n",
       "0    a      3      9\n",
       "1    b      8      2\n",
       "2    a      3      7\n",
       "3    b      1      4\n",
       "4    a      3      8\n",
       "5    b      8      3\n",
       "6    a      1      6\n",
       "7    a      6      8"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>key1</th>\n",
       "      <th>data1</th>\n",
       "      <th>data2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>a</td>\n",
       "      <td>3</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>b</td>\n",
       "      <td>8</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>a</td>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>b</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>a</td>\n",
       "      <td>3</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>b</td>\n",
       "      <td>8</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>a</td>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>a</td>\n",
       "      <td>6</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 40
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-08T14:32:29.727834Z",
     "start_time": "2025-01-08T14:32:29.722955Z"
    }
   },
   "cell_type": "code",
   "source": [
    "df_obj\n",
    "\n",
    "\n",
    "#实现a组和b组，谁比平均分高，谁比平均分低\n",
    "def diff_mean(s):\n",
    "    \"\"\"\n",
    "        返回数据与均值的差值，s传入的是某一个分组\n",
    "    \"\"\"\n",
    "    return s - s.mean()\n",
    "\n",
    "\n",
    "print(df_obj.groupby('key1').transform(diff_mean))"
   ],
   "id": "b601240304f63502",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "      data1  data2\n",
      "0 -0.200000    1.4\n",
      "1  2.333333   -1.0\n",
      "2 -0.200000   -0.6\n",
      "3 -4.666667    1.0\n",
      "4 -0.200000    0.4\n",
      "5  2.333333    0.0\n",
      "6 -2.200000   -1.6\n",
      "7  2.800000    0.4\n"
     ]
    }
   ],
   "execution_count": 41
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
